This notebook contains the part of the project related to the implementation of DETR algorithm to deal with the object detection task on TrashCan dataset.
After a first part that loads the dataset, preprocesses the images and builds a DataLoader object from them, a second part is dedicated to the re-implementation of the DETR algorithm. We re-implemented the architecture of the model and all the functions needed to train it ourselves on the TrashCan dataset. A third part tests whether we can achieve better performance by starting from a pre-trained DETR model, which is re-trained on the TrashCan dataset. The last part implements the computation of the mAP metric and computes it on our DETR models, so that their performance can be compared to that of other models.
#Install a more recent version because of compatibility issues further down in the notebook
#https://discuss.pytorch.org/t/runtimeerror-each-element-in-list-of-batch-should-be-of-equal-size/91737
!pip install torch==1.5.0
!pip install torchvision==0.6.0
!pip install fiftyone
!pip install pycocotools
import torch
import torch.nn as nn
from torchvision import transforms
import matplotlib.pyplot as plt
import matplotlib.patches as patches #In order to draw the box !
import numpy as np
!pip install pytorch_pretrained_vit
from pytorch_pretrained_vit import ViT
# Device configuration: use the first GPU when available, otherwise fall back to CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
# Let cuDNN benchmark convolution algorithms (safe here: input sizes are fixed).
torch.backends.cudnn.benchmark = True
# Bug fix: CUDA_LAUNCH_BLOCKING must be set as an *environment variable* to make
# CUDA kernel launches synchronous for debugging; the original line
# `CUDA_LAUNCH_BLOCKING=1.` only created an unused Python float.
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
from torchvision import models
device
The Trashcan dataset can directly be downloaded from their website. We will focus on the "instances" version of the dataset, which already contains predefined train and test sets. For the train set for instance, the annotations for the images are in the files instances_train_trashcan.json and the images are in a folder train.
All the annotations follow the COCO format, which is detailed here : https://cocodataset.org/#format-data
!wget "https://conservancy.umn.edu/bitstream/handle/11299/214865/dataset.zip?sequence=12&isAllowed=y"
!unzip "dataset.zip?sequence=12&isAllowed=y"
from torchvision import datasets
from torch.utils.data import DataLoader
# The directory containing the source images
data_path = "dataset/instance_version/train"
# The path to the COCO format labels JSON file
labels_path = "dataset/instance_version/instances_train_trashcan.json"
import torch.utils.data as data
from PIL import Image
import os
import os.path
We wanted to use COCO's API via torchvision.datasets.CocoDetection (https://pytorch.org/vision/stable/datasets.html), and we had to customize some functions to obtain the DataLoader format we wanted.
We resized the images, kept only the boxes and category ids of the targets, and stored them in an array of dictionaries.
The idea of these custom functions is to handle the fact that the number of targets differs from image to image; we did not want a single tensor padded with many empty objects, to avoid useless time consumption.
class CocoDetection_diy_bis(data.Dataset) :
    """`MS Coco Detection <http://mscoco.org/dataset/#detections-challenge2016>`_ Dataset.

    Custom COCO-style detection dataset: images are resized and ImageNet-normalised,
    and the targets are reduced to the bounding boxes and (0-based) category ids.

    Args:
        root (string): Root directory where images are downloaded to.
        annFile (string): Path to json annotation file.
        size ((int, int)): target size of the resized images.
    """
    def __init__(self, root, annFile, size):
        from pycocotools.coco import COCO
        self.root = root
        self.coco = COCO(annFile)
        self.ids = list(self.coco.imgs.keys())
        self.size = size
        # ImageNet normalisation constants, matching the pre-trained ResNet backbone.
        self.transform = transforms.Compose([transforms.Resize(size), transforms.ToTensor(),
                                             transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                                  std=[0.229, 0.224, 0.225])])

    def __getitem__(self, index):
        """
        Args:
            index (int): Index
        Returns:
            tuple: (image, targets) where image is the transformed tensor and
            targets is a dict with 'labels' (0-based category ids) and 'boxes'
            ([x, y, w, h] normalised by the original image size).
        """
        coco = self.coco
        img_id = self.ids[index]
        ann_ids = coco.getAnnIds(imgIds=img_id)
        target = coco.loadAnns(ann_ids)
        path = coco.loadImgs(img_id)[0]['file_name']
        # Resize / normalise the image :
        img = Image.open(os.path.join(self.root, path)).convert('RGB')
        original_size = img.size  # PIL convention: (width, height)
        img = self.transform(img)
        # Targets dict: boxes are rescaled to [0, 1] relative to the original size.
        targets = {'labels': [], 'boxes': []}
        for elem in target :
            box = np.copy(elem['bbox'])  # COCO format: [x, y, width, height]
            box[0] /= original_size[0]
            box[1] /= original_size[1]
            box[2] /= original_size[0]
            box[3] /= original_size[1]
            targets['boxes'].append(box)
            targets['labels'].append(elem['category_id'] - 1)  # -1 because category ids start at 1 in this dataset
        return img, targets

    def __len__(self):
        return len(self.ids)

    def __repr__(self):
        """Human-readable summary of the dataset."""
        fmt_str = 'Dataset ' + self.__class__.__name__ + '\n'
        fmt_str += '    Number of datapoints: {}\n'.format(self.__len__())
        fmt_str += '    Root Location: {}\n'.format(self.root)
        tmp = '    Transforms (if any): '
        fmt_str += '{0}{1}\n'.format(tmp, self.transform.__repr__().replace('\n', '\n' + ' ' * len(tmp)))
        # Bug fix: the original unconditionally referenced self.target_transform,
        # an attribute that __init__ never sets, so repr() raised AttributeError.
        target_transform = getattr(self, 'target_transform', None)
        if target_transform is not None:
            tmp = '    Target Transforms (if any): '
            fmt_str += '{0}{1}'.format(tmp, target_transform.__repr__().replace('\n', '\n' + ' ' * len(tmp)))
        return fmt_str
# Build the training dataset (images resized to 224x224 and ImageNet-normalised).
instances_train_4 = CocoDetection_diy_bis(root = data_path, annFile = labels_path, size=(224,224))
def collate_fn_diy (batch) :
    """Collate a list of (image, targets) pairs into one batch.

    Parameters
    ----------
    batch : list of tuples (img, targets) as produced by CocoDetection_diy_bis.

    Returns
    -------
    images : tensor of dim batch_size x 3 x 224 x 224
    targets : list of dicts, one per image, with:
        - "labels": Tensor of dim [num_target_boxes] containing the class labels
        - "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates
    """
    imgs, trgts = zip(*batch)
    # Stack the individual 3 x H x W images into a single batch tensor.
    images = torch.stack(list(imgs), dim=0)
    # One dict per image: the number of boxes varies, so they cannot be stacked.
    targets = [
        {'labels': torch.from_numpy(np.array(t["labels"])),
         'boxes': torch.from_numpy(np.array(t["boxes"]))}
        for t in trgts
    ]
    return images, targets
# Format DataLoader :
# The custom collate_fn handles the variable number of boxes per image.
instances_train_dataloader_4 = DataLoader(instances_train_4, batch_size=5, shuffle=True, collate_fn = collate_fn_diy)
# The 22 category names of the TrashCan "instances" annotations, ordered so they
# can be indexed directly with our shifted (category_id - 1) labels.
INSTANCES_CLASSES = ['rov','plant', 'animal_fish', 'animal_starfish', 'animal_shells', 'animal_crab', 'animal_eel', 'animal_etc',
'trash_clothing', 'trash_pipe', 'trash_bottle', 'trash_bag', 'trash_snack_wrapper','trash_can', 'trash_cup',
'trash_container','trash_unknown_instance', 'trash_branch', 'trash_wreckage', 'trash_tarp', 'trash_rope','trash_net']
import matplotlib.patches as patches
# Inverse of the ImageNet normalisation applied in CocoDetection_diy_bis, used to
# recover a displayable image.
# Bug fix: the forward transform uses std 0.225 for the third channel; the
# original inverse mistakenly used 0.255 in both mean and std.
inv_normalize = transforms.Normalize(
    mean=[-0.485/0.229, -0.456/0.224, -0.406/0.225],
    std=[1/0.229, 1/0.224, 1/0.225]
)
def plot_bounding_boxes(train_images, train_labels, idx_image, category_labels = INSTANCES_CLASSES, size = (224,224)) :
    """Display image `idx_image` of a batch with its ground-truth boxes and labels.

    Parameters
    ----------
    train_images : tensor batch_size x 3 x H x W of normalised images
    train_labels : list of dicts with 'labels' and 'boxes' tensors (boxes are normalised [x, y, w, h])
    idx_image : int, index of the image in the batch
    category_labels : list mapping category id -> category name
    size : unused, kept for backward compatibility
    """
    fig,ax = plt.subplots(1)
    # Display the image (undo the ImageNet normalisation first) :
    img = train_images[idx_image].squeeze()
    img_unnormalized = inv_normalize(img)
    ax.imshow(transforms.functional.to_pil_image(img_unnormalized))
    # img is (C, H, W): x / w must be scaled by the width and y / h by the height.
    # (Bug fix: the original swapped the two scale factors, which went unnoticed
    # only because the images are square 224 x 224.)
    _, height, width = img.shape
    # Plot bounding boxes & category names :
    labels, boxes = train_labels[idx_image]['labels'].numpy(), train_labels[idx_image]['boxes'].numpy()
    for k, category_id in enumerate(labels) :
        box = boxes[k]
        xy = (box[0] * width, box[1] * height)
        rect = patches.Rectangle(xy, box[2] * width, box[3] * height, linewidth=1, edgecolor='r', facecolor='none')
        ax.add_patch(rect)
        ax.text(xy[0], xy[1], category_labels[category_id], fontsize=8, bbox=dict(facecolor='r', alpha=0.5))
    plt.show()
Remark:
In the FB demos, I realized that in practice, outside of the training procedure, predictions are kept only if their probability/score exceeds a certain threshold (0.7 in their examples), so I added a keep_pred parameter for that purpose.
# Inverse ImageNet normalisation (duplicate of the definition above, kept so this
# notebook section runs standalone).
# Bug fix: the forward transform uses std 0.225 for the third channel; the
# original inverse mistakenly used 0.255 in both mean and std.
inv_normalize = transforms.Normalize(
    mean=[-0.485/0.229, -0.456/0.224, -0.406/0.225],
    std=[1/0.229, 1/0.224, 1/0.225]
)
def plot_bounding_boxes_predictions (images, predictions, idx_image, category_labels = INSTANCES_CLASSES, keep_pred = 0.7) :
    """Display image `idx_image` with the model's predicted boxes.

    Only predictions whose most likely class is a real object (not the "no
    object" class) and whose probability exceeds `keep_pred` are drawn.
    """
    fig,ax = plt.subplots(1)
    # Display the image (undo the ImageNet normalisation first) :
    img = images[idx_image].squeeze()
    img_unnormalized = inv_normalize(img)
    ax.imshow(transforms.functional.to_pil_image(img_unnormalized))
    # img is (C, H, W): scale x / w by the width and y / h by the height.
    # (Bug fix: the original swapped the two scale factors; harmless only because
    # the images are square.)
    _, height, width = img.shape
    # Class probabilities and boxes of the N queries for this image :
    labels_probas, boxes = predictions['pred_logits'][idx_image].cpu().softmax(-1).numpy(), predictions['pred_boxes'][idx_image].cpu().numpy()
    labels = labels_probas.argmax(-1)
    labels_prob = labels_probas.max(-1)
    for k, category_id in enumerate(labels) :
        # 22 is the index of the fictive "no object" class (== NB_INSTANCES_CLASSES).
        if category_id != 22 and labels_prob[k] > keep_pred :
            box = boxes[k]
            xy = (box[0] * width, box[1] * height)
            rect = patches.Rectangle(xy, box[2] * width, box[3] * height, linewidth=1, edgecolor='r', facecolor='none')
            ax.add_patch(rect)
            ax.text(xy[0], xy[1], category_labels[category_id], fontsize=8, bbox=dict(facecolor='r', alpha=0.5))
    plt.show()
import pandas as pd
def description(instances, category_labels = INSTANCES_CLASSES) :
    """Plot summary statistics of a detection dataset: proportion of objects per
    category, boxplots of (normalised) object areas per category, and the
    distribution of the number of objects per image."""
    #Dataloader with only one image per batch (for simplicity) :
    instances_dataloader_1 = DataLoader(instances, batch_size=1, shuffle=True, collate_fn = collate_fn_diy)
    # We will plot the number of objects of each category, and the boxplot of their sizes :
    nb_classes = len(category_labels)
    objects = []        # category id of every object seen
    objects_image = []  # index of the image each object belongs to
    objects_shape = []  # normalised area (w * h) of each object
    for batch_id, (images, labels) in enumerate(instances_dataloader_1):
        boxes_gt, labels_gt = labels[0]['boxes'].numpy(), labels[0]['labels'].numpy() # arrays of shape : num_objects x 4 and num_objects
        for k, label in enumerate(labels_gt) :
            objects.append(label)
            # Boxes are [x, y, w, h] normalised, so w * h is the relative area.
            objects_shape.append(boxes_gt[k][-1] * boxes_gt[k][-2])
            objects_image.append(batch_id)
    desc = pd.DataFrame({'category_id' : objects, 'area' : objects_shape, 'image_idx' : objects_image})
    # Adding the category names :
    df_labels = pd.DataFrame({'category_id' : [i for i in range(nb_classes)], 'category' : category_labels})
    desc = desc.join(df_labels.set_index('category_id'), on='category_id')
    # Plots :
    # Proportion of each object category in the dataset :
    fig,ax = plt.subplots(1,2, figsize=(14,5))
    # 'area' is used here as a mere row counter, then converted to a percentage.
    desc_count = desc[['category','category_id','area']].groupby('category').agg({'category_id':'min', 'area' :'count'}).reset_index()
    desc_count['area'] *= 100 / desc_count['area'].sum()
    # desc_count = desc_count.sort_values('category_id', ascending=False)
    desc_count.plot(x='category',y='area', ylabel='count', kind='barh', legend=False, ax =ax[0])
    ax[0].set_xlabel('proportion of objects (%)')
    # Shape (area) of objects per category :
    desc.boxplot('area', by='category', vert=False, grid=False, showfliers=False, ax=ax[1])
    ax[1].set_xlabel('area (normalized)')
    plt.show()
    # Number of objects per image :
    fig2,ax2 = plt.subplots()
    desc_sum = desc[['image_idx','area']].groupby('image_idx').count().reset_index()
    desc_sum.boxplot('area', vert=False, grid=False, ax=ax2)
    ax2.set_xlabel('number of objects per image')
    plt.show()
    print('Mean number of objects per image :',desc_sum['area'].mean())
# Dataset statistics: build fresh dataset objects and plot their description.
instances_train_1 = CocoDetection_diy_bis(root = data_path, annFile = labels_path, size=(224,224))
print('Description of the train set :')
description(instances_train_1)
# Validation split (same directory layout as the train split).
data_path_val = "dataset/instance_version/val"
labels_path_val = "dataset/instance_version/instances_val_trashcan.json"
instances_val_1 = CocoDetection_diy_bis(root = data_path_val, annFile = labels_path_val, size=(224,224))
print('Description of the validation set :')
description(instances_val_1)
Our goal was to re-implement the DETR architecture and the functions needed to train it ourselves on the TrashCan dataset.
Note that, to help us, we read and got inspiration from some notebooks with tutorials available on the github corresponding to the article, including a Notebook with a minimal version of DETR for beginners, and some other implementation inspired from the FB one .
Backbone : We take a pre-trained ResNet-50, whose output has $2048$ channels. In practice, following the implementations we took inspiration from, this size is reduced to a smaller dimension hidden_dim ($256$ by default) by a convolution.
Position encoding : In the latest FB version of DETR, the positional encoding is learned. Here, we will just focus on a simpler encoding, using the sine encoding seen in the course: to encode a position $t \in \mathbb{N}$ in $d$ dimensions (with $d$ even), we use: $$(f(t))_i = \begin{cases} \sin(w_k t) & \text{if } i = 2k \\ \cos(w_k t) & \text{if } i = 2k+1 \end{cases}, \quad \text{with } w_k = \frac{1}{10000^{2k/d}}$$
Transformer : To simplify, a transformer's architecture already implemented in pytorch was used via : nn.Transformer
Prediction head : predictions heads for the class and the bbox are two (independent) linear networks.
import math
class DETR (torch.nn.Module) :
    """Re-implementation of DETR: ResNet-50 backbone + 1x1 conv, fixed 2D sine
    positional encoding, nn.Transformer, and two linear prediction heads
    (class logits and normalised bounding boxes)."""
    def __init__ (self, num_classes, retrain_resnet = False, hidden_dim=256, nheads=8, num_encoder_layers=6, num_decoder_layers=6, N = 100, device='cpu') :
        '''
        Parameters :
        -----------
        num_classes : int
            number of categories in the dataset (NOT counting the empty class, which is added internally)
        retrain_resnet : bool (default False)
            if False, the parameters of the resnet feature extractor are frozen and not updated during training (can save time)
        hidden_dim : int (default 256)
            desired dimension of the image features (backbone output)
        nheads, num_encoder_layers, num_decoder_layers : int (default 8,6,6)
            transformer hyper-parameters
        N : int (default 100)
            size of the set that will be predicted by the transformer (before the bipartite matching loss)
        device : str (default 'cpu')
            device the positional-encoding tensors are moved to inside forward()
        '''
        super(DETR, self).__init__()
        self.device = device
        self.num_classes = num_classes
        resnet50 = models.resnet50(pretrained=True)
        # Backbone : resnet CNN (batch_size x 3 x H_0 x W_0 -> batch_size x 2048 x H x W)
        # followed by a 1x1 Conv2D (batch_size x 2048 x H x W -> batch_size x hidden_dim x H x W)
        self.resnet50_features = nn.Sequential(*(list(resnet50.children())[:-2]))
        if not retrain_resnet :
            for param in self.resnet50_features.parameters():
                param.requires_grad = False # freeze the resnet50 parameters when retrain_resnet==False
        self.conv = nn.Conv2d(2048, hidden_dim, 1)
        # Fixed 2D sine positional encoding: one d_model-dim encoding per row index
        # and one per column index; they are concatenated in forward().
        # NOTE(review): row_embed / col_embed are plain tensors (neither parameters
        # nor registered buffers), so Module.to(device) does NOT move them and they
        # are absent from state_dict -- hence the explicit .to(self.device) calls in
        # forward().  length = N // 2 = 50 looks like an upper bound on the
        # feature-map side (7 for 224x224 inputs) -- TODO confirm.
        d_model, length = hidden_dim // 2, N // 2
        self.row_embed = torch.zeros(length, d_model)
        self.col_embed = torch.zeros(length, d_model)
        position_row = torch.arange(0, length).unsqueeze(1)
        position_col = torch.arange(0, length).unsqueeze(1)
        # Standard sine/cosine frequencies w_k = 1 / 10000^(2k/d_model).
        w = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        self.row_embed[:, 0::2] = torch.sin(position_row.float() * w)
        self.row_embed[:, 1::2] = torch.cos(position_row.float() * w)
        self.col_embed[:, 0::2] = torch.sin(position_col.float() * w)
        self.col_embed[:, 1::2] = torch.cos(position_col.float() * w)
        # Learned query embeddings fed to the transformer decoder :
        self.query_pos = nn.Parameter(torch.rand(N, hidden_dim)) # N x hidden_dim
        # Transformer :
        self.transformer = nn.Transformer(hidden_dim, nheads, num_encoder_layers, num_decoder_layers)
        # Prediction heads applied after the transformer :
        self.linear_class = nn.Linear(hidden_dim, num_classes + 1) # num_classes + 1 to add the "No object" fictive class
        self.linear_bbox = nn.Linear(hidden_dim, 4)
    def forward (self, inputs) :
        # inputs shape : 3 x 224 x 224 or batch_size x 3 x 224 x 224
        if len(inputs.shape) == 3 : # add a batch dimension when a single image is given
            inputs = inputs.unsqueeze(0) # shape : 1 x 3 x 224 x 224
        # Feature extraction : backbone + conv2d to get the features of the inputs
        x = self.resnet50_features(inputs) # x shape : batch_size x 2048 x H x W (H=W=7 here)
        h = self.conv(x) #h shape : batch_size x hidden_dim x H x W
        # Positional encoding : concatenate the column and row encodings into one
        # hidden_dim-dim vector per spatial position, flattened to sequence form.
        H, W = h.shape[-2:]
        pos = torch.cat([
            self.col_embed[:W].unsqueeze(0).repeat(H, 1, 1),
            self.row_embed[:H].unsqueeze(1).repeat(1, W, 1),
        ], dim=-1).flatten(0, 1).unsqueeze(1) # pos shape : H*W x 1 x hidden_dim
        queries = self.query_pos.unsqueeze(1)
        batch_size = inputs.shape[0]
        if batch_size != 1 :
            queries = torch.repeat_interleave(queries, batch_size, dim=1) # shape N x batch_size x hidden_dim
        # Transformer : encoder input is the (scaled) image features plus the
        # positional encoding; decoder input is the N learned query embeddings.
        queries = queries.to(self.device)
        keys = (pos.to(self.device) + 0.1 * h.to(self.device).flatten(2).permute(2, 0, 1))
        keys = keys.to(self.device)
        h = self.transformer(keys, queries).transpose(0,1) # batch_size x N x hidden_dim
        # Prediction heads giving a bbox and a class prediction for each query :
        pred_logits = self.linear_class(h) # batch_size x N x (num_classes + 1)
        pred_boxes = self.linear_bbox(h).sigmoid() # batch_size x N x 4 ; sigmoid keeps boxes normalised in [0, 1]
        return {'pred_logits': pred_logits,
                'pred_boxes': pred_boxes}
To train the DETR, we need to define some functions to be able to compare predictions to targets. We will need :
with : $\lambda_{iou}, \lambda_{L1} \in \mathbb{R}$ hyperparameters, and $\mathcal{L}_{iou}(b_1, b_2) = 1 - \left( \frac{|b_1 \cap b_2|}{|b_1 \cup b_2|} - \frac{|B(b_1,b_2) \setminus (b_1 \cup b_2)|}{|B(b_1,b_2)|} \right)$, with $B(b_1,b_2)$ the smallest box that contains both $b_1$ and $b_2$.
def bbox_cxcywh_to_xyxy(x):
    """Convert boxes from [center_x, center_y, width, height] to corner format
    [x_min, y_min, x_max, y_max] (lower-left and upper-right corners).
    Taken from the FB DETR repository."""
    cx, cy, w, h = x.unbind(-1)
    half_w = 0.5 * w
    half_h = 0.5 * h
    corners = (cx - half_w, cy - half_h, cx + half_w, cy + half_h)
    return torch.stack(corners, dim=-1)
def loss_iou (b1, b2) :
    """Pairwise generalised-IoU (GIoU) loss between two sets of boxes.

    Parameters
    ----------
    b1, b2 : tensors of boxes in [c_x, c_y, w, h] format (shapes n x 4 and m x 4);
        b1 holds the ground-truth boxes, b2 the associated predictions.

    Returns
    -------
    Tensor of shape len(b1) x len(b2) whose entry (i, j) is
    1 - IoU(b1_i, b2_j) + (|B| - |union|) / |B|, where B is the smallest box
    enclosing both b1_i and b2_j.
    """
    area_gt, area_pred = b1[:, 2] * b1[:, 3], b2[:, 2] * b2[:, 3]
    corners_gt, corners_pred = bbox_cxcywh_to_xyxy(b1), bbox_cxcywh_to_xyxy(b2)
    # Pairwise corner comparisons (broadcast to n x m x 2): the intersection keeps
    # the largest lower-left and smallest upper-right corner; the enclosing box B
    # keeps the smallest lower-left and largest upper-right corner.
    inter_ll = torch.max(corners_gt[:, None, :2], corners_pred[:, :2])
    inter_ur = torch.min(corners_gt[:, None, 2:], corners_pred[:, 2:])
    encl_ll = torch.min(corners_gt[:, None, :2], corners_pred[:, :2])
    encl_ur = torch.max(corners_gt[:, None, 2:], corners_pred[:, 2:])
    # Intersection area; the clamp handles disjoint boxes (negative extent -> 0).
    inter_wh = (inter_ur - inter_ll).clamp(min=0)
    inter_area = inter_wh[:, :, 0] * inter_wh[:, :, 1]
    # Union via |A u B| = |A| + |B| - |A n B|.
    union_area = area_gt[:, None] + area_pred - inter_area
    # Area of the smallest box that would cover both boxes.
    encl_wh = (encl_ur - encl_ll).clamp(min=0)
    encl_area = encl_wh[:, :, 0] * encl_wh[:, :, 1]
    # All the ingredients are there to compute the GIoU loss :
    return 1 - (inter_area / union_area) + ((encl_area - union_area) / encl_area)
def loss_bbox (b1,b2, lambda_iou, lambda_1) :
    """Pairwise box loss: weighted sum of the GIoU loss and the L1 distance.

    Parameters
    ----------
    b1, b2 : tensors of boxes in [c_x, c_y, w, h] format (e.g. n x 4 and m x 4);
        b1 holds the ground-truth boxes, b2 the associated predictions.
    lambda_iou, lambda_1 : float weights of the two terms.

    Returns
    -------
    Tensor of shape len(b1) x len(b2) whose entry (i, j) is the combined box
    loss between b1[i] and b2[j].
    """
    giou_term = loss_iou(b1, b2)
    l1_term = torch.cdist(b1, b2, p=1)
    return lambda_iou * giou_term + lambda_1 * l1_term
The bipartite matching was coded in a class. The idea is to compute cost matrices between predictions and ground truths. We then try to assign one prediction to each ground truth without repetition. It is a Hungarian assignment problem.
We start by computing a matrix that represents the "cost" between each prediction i and ground truth j. We then use an existing matching function, implemented in scipy.optimize.linear_sum_assignment, which solves the linear-sum assignment problem, i.e. it minimises the total cost of all assignments.
from scipy.optimize import linear_sum_assignment
# linear_sum_assignment takes a cost_matrix (array) and returns row_ind, col_ind (arrays), the assignment minimising the linear sum of costs
class Bipartite_Matching (nn.Module) :
    """Hungarian matcher: assigns each ground-truth object of a batch to the best
    prediction, using a cost combining the predicted class probability, the GIoU
    loss and the L1 box distance."""
    def __init__ (self, lambda_category, lambda_iou, lambda_1, device='cpu') :
        """
        Parameters :
        -----------
        lambda_category, lambda_iou, lambda_1 : float
            hyper-parameters weighting the 3 different losses in the linear combination giving the final matching cost
        device : str (default 'cpu')
            device on which the box cost is computed
        """
        super().__init__()
        self.device = device
        self.lambda_category = lambda_category
        self.lambda_iou = lambda_iou
        self.lambda_1 = lambda_1

    @torch.no_grad()
    def forward(self, predictions, ground_truth) :
        """
        Parameters :
        ------------
        predictions : dict
            contains : "pred_logits": Tensor of dim [batch_size, num_queries (N), num_classes] with the classification logits
                       "pred_boxes": Tensor of dim [batch_size, num_queries (N), 4] with the predicted box coordinates
        ground_truth : list of size batch_size
            contains dicts with : "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth objects in the target) containing the class labels
                                  "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates

        Output :
        -------
        list of size batch_size of tuples (ground-truth indices, prediction indices)
        describing the optimal one-to-one assignment for each image
        """
        batch_size, num_queries = predictions["pred_logits"].shape[:2]
        # Flatten the predictions and concatenate all ground truths so the cost
        # matrices can be computed in one shot :
        predictions_prob = predictions["pred_logits"].flatten(0, 1).softmax(-1).float() # [batch_size * num_queries, num_classes + 1]
        predictions_bbox = predictions["pred_boxes"].flatten(0, 1).float() # [batch_size * num_queries, 4]
        ground_truth_category = torch.cat([gt["labels"].type(torch.long) for gt in ground_truth]) # [total_num_ground_truth]
        ground_truth_bbox = torch.cat([gt["boxes"] for gt in ground_truth]).float() # [total_num_ground_truth, 4]
        # Cost terms :
        L_category = - self.lambda_category * predictions_prob[:, ground_truth_category] # probability of the ground-truth class under each prediction, [batch_size * num_queries, total_num_ground_truth]
        # Bug fix: the original sent the tensors to the notebook-level global
        # `device` instead of self.device, silently depending on outside state.
        L_bbox = loss_bbox(ground_truth_bbox.to(self.device), predictions_bbox.to(self.device), self.lambda_iou, self.lambda_1) # [total_num_ground_truth, batch_size * num_queries]
        L = L_category.permute(1,0) + L_bbox # [total_num_ground_truth, batch_size * num_queries]
        # Solve one linear-sum assignment problem per image of the batch :
        sizes_ground_truth = [len(gt["boxes"]) for gt in ground_truth] # number of ground truths per image, [batch_size]
        indices_affectations = []
        for i,c in enumerate(L.split(sizes_ground_truth, 0)) :
            # Keep only the columns corresponding to this image's own queries :
            L_i = c[:,i*num_queries:(i+1)*num_queries]
            row_ind, col_ind = linear_sum_assignment(L_i.cpu())
            indices_affectations.append((torch.as_tensor(row_ind, dtype=torch.int64),torch.as_tensor(col_ind, dtype=torch.int64)))
        return indices_affectations
Now that the bipartite matching is done, that is to say that we have found a permutation $\sigma$ that realizes the best possible matching targets / predictions, we can calculate the final loss function :
$$\mathcal{L}_{Hungarian}(y,\widehat{y},\sigma) = \sum_{i=1}^{N} \left[ - \log (\widehat{p}_{\sigma(i)}(c_i)) + \mathbb{1}_{\{c_i \neq -1\}} \mathcal{L}_{bbox}(b_i, \widehat{b}_{\sigma(i)}) \right]$$With $y$ the targets, $\widehat{y}=\{\widehat{y_i}\}_{1\leq i\leq N}$ the predictions and $\sigma$ the permutation obtained in the bipartite matching step.
In particular, the article recommends down-weighting the log-probabilities of the empty-class predictions by a factor of 10, since $N$ has been deliberately taken too high and the proportion of empty objects among the predictions will always be very high.
import torch.nn.functional as F
def hungarian_loss (predictions, targets, indices_affectations, num_classes, lambda_iou, lambda_1, device='cpu', downweight_empty = 1/23) :
    """Final DETR loss, computed after the bipartite matching step.

    Parameters
    ----------
    predictions : dict with "pred_logits" [batch_size, N, num_classes + 1] and "pred_boxes" [batch_size, N, 4]
    targets : list (length batch_size) of dicts with "labels" and "boxes"
    indices_affectations : output of Bipartite_Matching.forward
    num_classes : int, number of real classes (the empty class gets index num_classes)
    lambda_iou, lambda_1 : NOTE(review): currently unused -- the box term calls
        loss_bbox with weights (1, 1); kept for interface compatibility
    device : device on which the loss tensors are created
    downweight_empty : weight of the empty class in the cross entropy (the paper
        down-weights it because most of the N queries are unmatched)

    Returns
    -------
    scalar tensor: cross-entropy class loss + mean matched-box loss
    """
    batch_size, N = predictions['pred_logits'].shape[:2]
    # Class targets: every query defaults to the empty class (index num_classes,
    # not -1); matched queries receive the class of their assigned ground truth.
    # Box losses only involve matched queries, so matched target / predicted boxes
    # are simply collected in two lists.
    targets_class = torch.full((batch_size,N), num_classes, dtype=torch.int64, device=device)
    predictions_boxes = []
    targets_boxes = []
    # Bug fix: the original iterated over the *global* variable `labels` instead of
    # the `targets` parameter, which only worked because the training loop happened
    # to name its loop variable `labels`.
    for k,dic in enumerate(targets) :
        boxes_k = dic["boxes"].to(device) # tensor size num_target x 4
        labels_k = dic["labels"].to(device) # tensor size num_target
        indices_target, indices_pred = indices_affectations[k] # tensors of size num_target
        if indices_target.shape[0] != 0 :
            targets_class[k][indices_pred] = labels_k[indices_target]
            targets_boxes.append(boxes_k[indices_target])
            predictions_boxes.append(predictions['pred_boxes'][k][indices_pred])
    # Class loss: cross entropy with a reduced weight on the empty class :
    empty_weights = torch.ones(num_classes + 1, device=device) # weight of each of the (num_classes + 1) classes
    empty_weights[-1] = downweight_empty
    L_ce = F.cross_entropy(predictions['pred_logits'].transpose(1, 2), targets_class, empty_weights)
    # Robustness: a batch with no ground-truth object at all has no box term.
    if len(targets_boxes) == 0 :
        return L_ce
    targets_boxes = torch.cat(targets_boxes)
    predictions_boxes = torch.cat(predictions_boxes)
    # Box loss: reuse loss_bbox on the matched pairs; the diagonal pairs the i-th
    # target with the i-th matched prediction.  The paper sums, but averaging keeps
    # the value on the same scale as cross_entropy's default mean reduction.
    L_bbox = loss_bbox(targets_boxes.type(torch.float32), predictions_boxes, 1, 1)
    L_bbox = torch.diag(L_bbox).mean()
    return L_ce + L_bbox
We are now ready to train our own DETR models.
We have trained different versions of the algorithm (not all presented here), choosing whether or not to freeze the ResNet backbone and modifying the hyper-parameters :
Classic training hyper-parameters :
Hyper-parameters related to the DETR architecture :
the downweight_empty one. This parameter controls how much we down-weight the part of the loss related to the score predicted for the empty-object category. We have seen in our tests that this hyper-parameter really impacts the quality of the predictions.
!pip install tqdm
from tqdm import tqdm
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device
# Hyper-parameters :
NB_EPOCHS = 250
BATCH_SIZE = 32
NB_INSTANCES_CLASSES = 22
LEARNING_RATE = 0.00001
DOWNWEIGHT_EMPTY = 1/23
LAMBDA_CATEGORY = 1
LAMBDA_IOU = 1
LAMBDA_1 = 1
# Training images :
instances_train = CocoDetection_diy_bis(root = data_path, annFile = labels_path, size=(224,224))
instances_train_dataloader = DataLoader(instances_train, batch_size=BATCH_SIZE, shuffle=True, collate_fn = collate_fn_diy)
# Consistency fix: count the images of the dataset actually used by this
# dataloader (the original measured instances_train_4, an earlier, separate copy).
NB_TRAIN_IMAGES = len(instances_train)
print("Nb of images in the train set : ",NB_TRAIN_IMAGES)
# Validation images :
data_path_val = "dataset/instance_version/val"
labels_path_val = "dataset/instance_version/instances_val_trashcan.json"
instances_val = CocoDetection_diy_bis(root = data_path_val, annFile = labels_path_val, size=(224,224))
instances_val_dataloader = DataLoader(instances_val, batch_size=5, shuffle=True, collate_fn = collate_fn_diy)
NB_VAL_IMAGES = len(instances_val)
print("Nb of images in the validation set : ",NB_VAL_IMAGES)
# First experiment: DETR with a frozen ResNet backbone.
detr = DETR(num_classes=NB_INSTANCES_CLASSES, retrain_resnet=False, hidden_dim=256, nheads=8, num_encoder_layers=6, num_decoder_layers=6, N = 100,device=device)
detr = detr.to(device)
# Matcher and optimizer (Adam over all parameters; frozen ones get no gradient).
bipartite_matcher = Bipartite_Matching(lambda_category=LAMBDA_CATEGORY, lambda_iou=LAMBDA_IOU, lambda_1=LAMBDA_1, device= device)
optimizer = torch.optim.Adam(detr.parameters(), lr=LEARNING_RATE)
# Count the learnable (non-frozen) parameters.
detr_learnable_params = 0
for p in detr.parameters() :
    if p.requires_grad :
        detr_learnable_params += p.numel()
print('Total number of learnable parameters : ', detr_learnable_params)
# Sanity check that all parameters live on the GPU.
# NOTE(review): this assert fails on a CPU-only machine; it assumes CUDA is available.
for p in detr.parameters() :
    assert p.is_cuda
# Training loop (frozen backbone): track the mean loss per image at each epoch.
training_losses = []
validation_losses = []
for epoch in tqdm(range(NB_EPOCHS)) :
    print("-"*50)
    print("Epoch :",epoch)
    training_loss = 0.
    for batch_id, (images, labels) in enumerate(instances_train_dataloader):
        optimizer.zero_grad()
        images = images.to(device)
        # Model predictions :
        predictions = detr(images)
        # Loss : bipartite matching first, then the Hungarian loss on the matched pairs.
        indices_affectations = bipartite_matcher.forward(predictions, labels)
        loss = hungarian_loss(predictions, labels, indices_affectations, num_classes=NB_INSTANCES_CLASSES, lambda_iou=LAMBDA_IOU, lambda_1=LAMBDA_1, device=device, downweight_empty=DOWNWEIGHT_EMPTY)
        # Gradient computation :
        loss.backward()
        # 1 optimization step :
        optimizer.step()
        # Training loss, weighted by the batch size so the epoch average is per image :
        training_loss += loss.item() * len(labels)
        if batch_id % 20 == 0 :
            print(f'Training loss on batch_id {batch_id} : {loss.item()}')
    # Validation loss (no gradients needed) :
    val_loss = 0.
    with torch.no_grad() :
        for batch_id, (images, labels) in enumerate(instances_val_dataloader):
            images = images.to(device)
            # Model predictions :
            predictions = detr(images)
            # Loss :
            indices_affectations = bipartite_matcher.forward(predictions, labels)
            loss = hungarian_loss(predictions, labels, indices_affectations, num_classes=NB_INSTANCES_CLASSES, lambda_iou=LAMBDA_IOU, lambda_1=LAMBDA_1, device=device, downweight_empty=DOWNWEIGHT_EMPTY)
            val_loss += loss.item() * len(labels)
    validation_losses.append(val_loss / NB_VAL_IMAGES)
    training_losses.append(training_loss / NB_TRAIN_IMAGES)
    print(f'=> Training loss at epoch {epoch} :', training_losses[-1])
    print(f'=> Validation loss at end of epoch {epoch} :', validation_losses[-1])
# Plot the training / validation loss curves.
epochs = [u for u in range(NB_EPOCHS)]
plt.plot(epochs, validation_losses, label = 'Validation Losses', color = 'orange')
plt.plot(epochs, training_losses, label = 'Training Losses', color = 'blue')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()
# Save the weights of this (frozen-backbone) run.
torch.save(obj=detr.state_dict(), f="detr_models/detr_{}epochs_{}batchsize_frozen.pt".format(NB_EPOCHS, BATCH_SIZE))
# Second experiment: same model but with the ResNet backbone trainable.
detr = DETR(num_classes=NB_INSTANCES_CLASSES, retrain_resnet=True, hidden_dim=256, nheads=8, num_encoder_layers=6, num_decoder_layers=6, N = 100,device=device)
detr = detr.to(device)
bipartite_matcher = Bipartite_Matching(lambda_category=LAMBDA_CATEGORY, lambda_iou=LAMBDA_IOU, lambda_1=LAMBDA_1, device = device)
optimizer = torch.optim.Adam(detr.parameters(), lr=LEARNING_RATE)
# Sanity check that all parameters live on the GPU.
# NOTE(review): this assert fails on a CPU-only machine; it assumes CUDA is available.
for p in detr.parameters() :
    assert p.is_cuda
# Count the learnable parameters (now includes the backbone).
detr_learnable_params = 0
for p in detr.parameters() :
    if p.requires_grad :
        detr_learnable_params += p.numel()
print('Total number of learnable parameters : ', detr_learnable_params)
# Training loop (trainable backbone): track the mean loss per image at each epoch.
training_losses = []
validation_losses = []
for epoch in tqdm(range(NB_EPOCHS)) :
    print("-"*50)
    print("Epoch :",epoch)
    training_loss = 0.
    for batch_id, (images, labels) in enumerate(instances_train_dataloader):
        optimizer.zero_grad()
        images = images.to(device)
        # Model predictions :
        predictions = detr(images)
        # Loss : bipartite matching first, then the Hungarian loss on the matched pairs.
        indices_affectations = bipartite_matcher.forward(predictions, labels)
        loss = hungarian_loss(predictions, labels, indices_affectations, num_classes=NB_INSTANCES_CLASSES, lambda_iou=LAMBDA_IOU, lambda_1=LAMBDA_1, device=device, downweight_empty=DOWNWEIGHT_EMPTY)
        # Gradient computation :
        loss.backward()
        # 1 optimization step :
        optimizer.step()
        # Training loss, weighted by the batch size so the epoch average is per image :
        training_loss += loss.item() * len(labels)
        if batch_id % 10 == 0 and epoch % 10 == 0:
            print(f'Training loss on batch_id {batch_id} : {loss.item()}')
    # Validation loss (no gradients needed) :
    val_loss = 0.
    with torch.no_grad() :
        for batch_id, (images, labels) in enumerate(instances_val_dataloader):
            images = images.to(device)
            # Model predictions :
            predictions = detr(images)
            # Loss :
            indices_affectations = bipartite_matcher.forward(predictions, labels)
            loss = hungarian_loss(predictions, labels, indices_affectations, num_classes=NB_INSTANCES_CLASSES, lambda_iou=LAMBDA_IOU, lambda_1=LAMBDA_1, device=device, downweight_empty=DOWNWEIGHT_EMPTY)
            val_loss += loss.item() * len(labels)
    validation_losses.append(val_loss / NB_VAL_IMAGES)
    training_losses.append(training_loss / NB_TRAIN_IMAGES)
    print(f'=> Training loss at epoch {epoch} :', training_losses[-1])
    print(f'=> Validation loss at end of epoch {epoch} :', validation_losses[-1])
# Plot the training / validation loss curves.
epochs = [u for u in range(NB_EPOCHS)]
plt.plot(epochs, validation_losses, label = 'Validation Losses', color = 'orange')
plt.plot(epochs, training_losses, label = 'Training Losses', color = 'blue')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()
# Save the weights of this (backbone-trainable) run.
torch.save(obj=detr.state_dict(), f="detr_models/detr_{}epochs_{}batchsize_{}lr.pt".format(NB_EPOCHS, BATCH_SIZE, LEARNING_RATE))
In the FB demos, I noticed that in practice, outside of the training procedure, a prediction is kept only if its probability exceeds a certain threshold, 0.7 in their examples.
# Reload a previously trained model for qualitative evaluation.
detr = DETR(num_classes=NB_INSTANCES_CLASSES, retrain_resnet=True, hidden_dim=256, nheads=8, num_encoder_layers=6, num_decoder_layers=6, N=100, device=device)
# FIX: load_state_dict is a method of the model, not of the torch module —
# the original `torch.load_state_dict(torch.load(...))` raises AttributeError.
detr.load_state_dict(torch.load("detr_models/detr_100epochs_32batchsize.pt"))
# Visual check: predictions vs. ground truth on a random validation batch.
instances_val = CocoDetection_diy_bis(root=data_path_val, annFile=labels_path_val, size=(224, 224))
instances_val_dataloader = DataLoader(instances_val, batch_size=5, shuffle=True, collate_fn=collate_fn_diy)
images, labels = next(iter(instances_val_dataloader))
with torch.no_grad():
    predictions = detr(images.cuda())
for sample_idx in range(5):
    plot_bounding_boxes_predictions(images, predictions, sample_idx, keep_pred=0)
    plot_bounding_boxes(images, labels, sample_idx)
# A second random validation batch, same visual check.
images, labels = next(iter(instances_val_dataloader))
with torch.no_grad():
    predictions = detr(images.cuda())
for sample_idx in range(5):
    plot_bounding_boxes_predictions(images, predictions, sample_idx, keep_pred=0)
    plot_bounding_boxes(images, labels, sample_idx)

# A third batch, additionally showing predictions filtered at probability >= 0.6.
images, labels = next(iter(instances_val_dataloader))
with torch.no_grad():
    predictions = detr(images.cuda())
for sample_idx in range(5):
    plot_bounding_boxes_predictions(images, predictions, sample_idx, keep_pred=0)
    plot_bounding_boxes_predictions(images, predictions, sample_idx, keep_pred=0.6)
    plot_bounding_boxes(images, labels, sample_idx)

# Closer look at two individual samples of the last batch.
plot_bounding_boxes_predictions(images, predictions, 1, keep_pred=0.8)
plot_bounding_boxes(images, labels, 1)
plot_bounding_boxes_predictions(images, predictions, 4)
plot_bounding_boxes(images, labels, 4)
Let's try to use the pretrained version of DETR available on the Facebook Research GitHub:
# Download the COCO-pretrained DETR (ResNet-50 backbone) from the official
# Facebook Research torch.hub entry.
detr_fb = torch.hub.load('facebookresearch/detr:main', 'detr_resnet50', pretrained=True)
We will just have to modify the final prediction head for the categories, as our number of categories is different from theirs. We will freeze the other parameters of the model:
# Inspect the original classification head (COCO classes + "no object").
detr_fb.class_embed
# FIX: the freeze loop was commented out, contradicting the stated intent of
# retraining only the prediction head. Freeze the pretrained weights first;
# the fresh head created below has requires_grad=True by default, so it is
# the only part that will be trained.
for param in detr_fb.parameters():
    param.requires_grad = False
# Replace the classification head: 22 TrashCan classes + 1 "no object" class.
detr_fb.class_embed = nn.Linear(in_features=256, out_features=23)
We now want to retrain it (only the prediction-head part) on our images:
# Hyper-parameters for fine-tuning the pretrained DETR on TrashCan.
NB_EPOCHS = 250
BATCH_SIZE = 32
NB_INSTANCES_CLASSES = 22  # TrashCan instance classes; the "no object" class is handled separately
LEARNING_RATE = 0.00001
# Weights passed to Bipartite_Matching / hungarian_loss below:
LAMBDA_CATEGORY = 1  # classification term
LAMBDA_IOU = 1       # IoU term
LAMBDA_1 = 1         # L1 box-regression term
# Move the model to the GPU and set up the matcher and the optimizer.
detr_fb = detr_fb.to(device)
bipartite_matcher = Bipartite_Matching(lambda_category=LAMBDA_CATEGORY, lambda_iou=LAMBDA_IOU,
                                       lambda_1=LAMBDA_1, device=device)
optimizer = torch.optim.Adam(detr_fb.parameters(), lr=LEARNING_RATE)

# Report how many parameters will actually be trained.
detr_learnable_params = sum(p.numel() for p in detr_fb.parameters() if p.requires_grad)
print('Total number of learnable parameters : ', detr_learnable_params)

# Sanity check: all parameters must be on the GPU.
for p in detr_fb.parameters():
    assert p.is_cuda
for epoch in tqdm(range(NB_EPOCHS)) :
print("-"*20)
print("Epoch :",epoch)
training_loss = 0.
for batch_id, (images, labels) in enumerate(instances_train_dataloader):
optimizer.zero_grad()
images = images.to(device)
# Prédictions du modèle :
predictions = detr_fb(images)
# Loss :
indices_affectations = bipartite_matcher.forward(predictions, labels)
loss = hungarian_loss(predictions, labels, indices_affectations, num_classes=NB_INSTANCES_CLASSES, lambda_iou=LAMBDA_IOU, lambda_1=LAMBDA_1, device=device)
# Calcul de gradient :
loss.backward()
# 1 optimization step :
optimizer.step()
# Training loss :
training_loss += loss.item() * len(labels)
if batch_id % 10 == 0 and epoch % 10 == 0:
print(f'Training loss on batch_id {batch_id} : {loss.item()}')
# Validation loss :
val_loss = 0.
with torch.no_grad() :
for batch_id, (images, labels) in enumerate(instances_val_dataloader):
images = images.to(device)
# Prédictions du modèle :
predictions = detr_fb(images)
# Loss :
indices_affectations = bipartite_matcher.forward(predictions, labels)
loss = hungarian_loss(predictions, labels, indices_affectations, num_classes=NB_INSTANCES_CLASSES, lambda_iou=LAMBDA_IOU, lambda_1=LAMBDA_1, device=device)
val_loss += loss.item() * len(labels)
validation_losses.append(val_loss / NB_VAL_IMAGES)
training_losses.append(training_loss / NB_TRAIN_IMAGES)
print(f'=> Training loss at epoch {epoch} :', training_losses[-1])
print(f'=> Validation loss at end of epoch {epoch} :', validation_losses[-1])
epochs = [u for u in range(NB_EPOCHS)]
plt.plot(epochs, validation_losses, label = 'Validation Losses', color = 'orange')
plt.plot(epochs, training_losses, label = 'Training Losses', color = 'blue')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()
torch.save(obj=detr_fb.state_dict(), f="detr_models/detr_FB_{}epochs_{}batchsize_{}lr.pt".format(NB_EPOCHS, BATCH_SIZE, LEARNING_RATE))
# Qualitative check of the fine-tuned FB model on 20 validation images.
instances_val = CocoDetection_diy_bis(root=data_path_val, annFile=labels_path_val, size=(224, 224))
instances_val_dataloader = DataLoader(instances_val, batch_size=20, shuffle=False, collate_fn=collate_fn_diy)
images, labels = next(iter(instances_val_dataloader))
with torch.no_grad():
    predictions = detr_fb(images.cuda())
for sample_idx in range(20):
    plot_bounding_boxes_predictions(images, predictions, sample_idx, keep_pred=0)
    plot_bounding_boxes_predictions(images, predictions, sample_idx, keep_pred=0.6)
    plot_bounding_boxes(images, labels, sample_idx)
def bbox_cxcywh_to_xyxy(x):
    """Convert boxes from [center_x, center_y, w, h] to [x_min, y_min, x_max, y_max].

    Adapted from the Facebook Research DETR repository. `x` is a tensor whose
    last dimension is 4; the conversion is applied along that last dimension.
    """
    x_c, y_c, w, h = x.unbind(-1)
    b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
         (x_c + 0.5 * w), (y_c + 0.5 * h)]
    return torch.stack(b, dim=-1)


def IoU(b1, b2):
    """Element-wise Intersection-over-Union between two sets of boxes.

    Parameters
    ----------
    b1, b2 : tensors of shape (M, 4), boxes in [c_x, c_y, w, h] format.
        b1 holds the ground-truth boxes and b2 the matched predictions;
        box i of b1 is compared with box i of b2.

    Returns
    -------
    Tensor of shape (M,) whose i-th entry is IoU(b1[i], b2[i]).
    (FIX: the original docstring claimed a len(b1) x len(b2) matrix, but the
    computation is element-wise.)
    """
    area1, area2 = b1[:, 2] * b1[:, 3], b2[:, 2] * b2[:, 3]
    b1_xyxy, b2_xyxy = bbox_cxcywh_to_xyxy(b1), bbox_cxcywh_to_xyxy(b2)
    # Intersection corners: the larger of the two lower-left corners and the
    # smaller of the two upper-right corners. (FIX: the original also computed
    # the unused opposite pair of corners; removed.)
    max_inf_gauche = torch.max(b1_xyxy[:, :2], b2_xyxy[:, :2])
    min_sup_droit = torch.min(b1_xyxy[:, 2:], b2_xyxy[:, 2:])
    # Intersection area: clamped at 0 so that disjoint boxes (negative
    # width/height) contribute no overlap.
    inter_wh = (min_sup_droit - max_inf_gauche).clamp(min=0)
    inter_area = inter_wh[:, 0] * inter_wh[:, 1]
    # Union via inclusion-exclusion: area_union = area_1 + area_2 - area_inter.
    # FIX: clamp avoids a 0/0 NaN for degenerate (zero-area) box pairs without
    # affecting regular boxes.
    union_area = (area1 + area2 - inter_area).clamp(min=1e-8)
    return inter_area / union_area
The following function computes the AP over a range of IoU thresholds, assuming that all the given arguments correspond to one category (they should contain all the boxes and logits predicted for that category over the whole validation set, with the "true" labels set to "True" if the ground-truth object matches the category and "False" otherwise):
from sklearn.metrics import precision_score,recall_score
def AP(label_true, boxes, pred_boxes):
    """Average Precision for a single class.

    All three arguments have the same length M: over the whole validation set,
    they gather the matched (ground-truth box, predicted box) pairs predicted
    as this class. `label_true` holds the string 'True' when the ground-truth
    label really is this class, 'False' otherwise. The AP is the area under
    the precision/recall curve obtained by sweeping an IoU threshold from 0.2
    to 0.7 in steps of 0.05.
    """
    thresholds = np.arange(start=0.2, stop=0.7, step=0.05)
    # IoU between each ground-truth box and its matched prediction:
    iou = IoU(boxes, pred_boxes).numpy()
    # Build the precision/recall curve, one point per threshold:
    precisions, recalls = [], []
    for thr in thresholds:
        # A pair counts as a positive detection when its IoU reaches the threshold.
        detected = ['True' if overlap >= thr else 'False' for overlap in iou]
        precisions.append(precision_score(y_true=label_true, y_pred=detected, pos_label='True', zero_division=0))
        recalls.append(recall_score(y_true=label_true, y_pred=detected, pos_label='True', zero_division=0))
    # Sentinel point (recall 0, precision 1) closes the curve.
    precisions.append(1.)
    recalls.append(0.)
    precisions, recalls = np.array(precisions), np.array(recalls)
    # Riemann sum of precision over the recall axis:
    return np.sum((recalls[:-1] - recalls[1:]) * precisions[:-1])
def mAP(boxes, labels, pred_boxes, pred_labels, nb_classes=23):
    """Mean Average Precision over all object classes.

    Parameters
    ----------
    boxes, labels : matched ground-truth boxes / labels, one entry per matched pair.
    pred_boxes, pred_labels : the corresponding predicted boxes / labels.
    nb_classes : total number of classes, including the trailing "no object"
        class, which is excluded from the mean.

    Returns
    -------
    (AP_n, mAP) where AP_n is the per-class AP list (last class dropped) and
    mAP is its mean.
    """
    # Group the matched pairs by predicted class.
    # FIX: two of the dicts below were built with a hard-coded 23 instead of
    # nb_classes, which would raise KeyError for any larger class count.
    label_true_n = {i: [] for i in range(nb_classes)}
    boxes_n = {i: [] for i in range(nb_classes)}
    pred_boxes_n = {i: [] for i in range(nb_classes)}
    for k, n in enumerate(pred_labels):
        boxes_n[n].append(boxes[k])
        pred_boxes_n[n].append(pred_boxes[k])
        # 'True' when the matched ground truth really belongs to class n:
        label_true_n[n].append(str(labels[k] == n))
    # Per-class AP (0 when the class was never predicted):
    AP_n = []
    for n in range(nb_classes):
        lb_true, bx_true, pred_bx = label_true_n[n], boxes_n[n], pred_boxes_n[n]
        if len(bx_true) == 0:
            AP_n.append(0)
        else:
            AP_n.append(AP(lb_true, torch.from_numpy(np.array(bx_true)).float(), torch.from_numpy(np.array(pred_bx)).float()))
    # Drop the last class ("no object") before averaging:
    AP_n = AP_n[:-1]
    return AP_n, np.mean(AP_n)
import pandas as pd
def mAP_detr(detr_model, bipartite_matcher, validation_dataloader):
    """Compute the per-class AP and the mAP of a DETR model.

    Expects `validation_dataloader` to yield batches of size 1 (only index 0
    of each batch is read). Returns the (AP_list, mAP) pair produced by `mAP`.
    """
    # Accumulate, over the whole validation set, the matched ground-truth /
    # predicted boxes and labels.
    boxes_, labels_, boxes_pred_, labels_pred_ = [], [], [], []
    # FIX: iterate over the dataloader received as argument; the original
    # ignored `validation_dataloader` and read the global
    # `instances_val_dataloader` instead.
    for batch_id, (images, labels) in enumerate(validation_dataloader):
        boxes_gt, labels_gt = labels[0]['boxes'], labels[0]['labels']
        with torch.no_grad():
            predictions = detr_model(images.cuda())
        pred_labels_probas, pred_boxes = predictions['pred_logits'][0].cpu().softmax(-1).numpy(), predictions['pred_boxes'][0].cpu().numpy()
        pred_labels = pred_labels_probas.argmax(-1)
        # Matching gt / predictions: list of length 1 containing a tuple (i, j).
        # NOTE(review): the original comment claimed i indexes predictions and
        # j ground truths, but the code below indexes the ground truth with i
        # and the predictions with j — verify against Bipartite_Matching.
        indices_affectations = bipartite_matcher.forward(predictions, labels)
        i, j = indices_affectations[0]
        i, j = i.numpy(), j.numpy()
        for k in range(len(i)):
            if len(boxes_) == 0:
                boxes_ = np.expand_dims(boxes_gt[i[k]], axis=0)
                boxes_pred_ = np.expand_dims(pred_boxes[j[k]], axis=0)
            else:
                boxes_ = np.concatenate((boxes_, np.expand_dims(boxes_gt[i[k]], axis=0)))
                boxes_pred_ = np.concatenate((boxes_pred_, np.expand_dims(pred_boxes[j[k]], axis=0)))
            labels_.append(labels_gt[i[k]].item())
            labels_pred_.append(pred_labels[j[k]])
        # Unmatched non-empty predictions are paired with the "no object"
        # label 22 and a dummy ground-truth box [0, 0, 0, 0].
        for k in range(len(pred_labels)):
            if (pred_labels[k] != 22) and (k not in j):
                labels_.append(22)
                labels_pred_.append(pred_labels[k])
                boxes_ = np.concatenate((boxes_, np.expand_dims(np.array([0, 0, 0, 0]), axis=0)))
                boxes_pred_ = np.concatenate((boxes_pred_, np.expand_dims(pred_boxes[k], axis=0)))
    # Computation of mAP:
    return mAP(boxes_, labels_, boxes_pred_, labels_pred_)
# Build a batch-size-1 validation dataloader for the mAP computation.
instances_val_1 = CocoDetection_diy_bis(root=data_path_val, annFile=labels_path_val, size=(224, 224))
# FIX: the DataLoader was built from `instances_val` instead of the freshly
# created `instances_val_1`, and mAP_detr was called with the raw dataset
# instead of the batch-size-1 dataloader.
instances_val_dataloader_1 = DataLoader(instances_val_1, batch_size=1, shuffle=True, collate_fn=collate_fn_diy)
# On FB detr with prediction head only re trained on 50 epochs :
# (FIX: the original ran this evaluation twice, the second time under a
# comment wrongly describing the DIY model; the duplicate is removed.)
a, b = mAP_detr(detr_fb, bipartite_matcher, instances_val_dataloader_1)
b
df_fb = pd.DataFrame({'classes': INSTANCES_CLASSES, 'AP': a})
df_fb['model'] = 'FB'
df_fb
df_fb = df_fb.sort_values('classes')
df_fb.plot(x='classes', y='AP', kind='barh', color='crimson')
# On detr trained on 250 epochs with training of the resnet :
# FIX: pass the batch-size-1 dataloader built above, not the raw dataset.
a_, b_ = mAP_detr(detr, bipartite_matcher, instances_val_dataloader_1)
b_
df = pd.DataFrame({'classes': INSTANCES_CLASSES, 'AP': a_})
df['model'] = 'DIY'
df
df = df.sort_values('classes')
df.plot(x='classes', y='AP', kind='barh', color='firebrick')
# Side-by-side per-class AP comparison of the two models.
fig, ax = plt.subplots(1, 2, figsize=(8, 5), sharey=True, sharex=True)
# FIX: legend='False' passes a truthy non-empty string, so the legend was
# still drawn; the keyword expects the boolean False to hide it.
df.plot(x='classes', y='AP', kind='barh', color='crimson', ax=ax[0], legend=False)
df_fb.plot(x='classes', y='AP', kind='barh', color='crimson', ax=ax[1], legend=False)